import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.options.display.max_columns = 100
import seaborn as sns
from util import *
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
!ls
train = pd.read_csv('../0_Data/train.csv')
test = pd.read_csv('../0_Data/test.csv')
for data in [train, test]:
print(f'~> [{var2str(data).ljust(5)}] has {bg(data.shape[0])} rows, and {bg(data.shape[1])} columns.')
train.head()
test.head()
My assumption here, that the organizers gathered all the data together, then they took the first 1,460 rows to be the training data, and the rest to be the testing data, in other words, the data is not shuffled.
Now, let's take an overview of the training data:
summary(train, 'Missing')
Wow! This summary table tells us a lot, here's the information we got:
Now, let's see the test data:
summary(test, 'Missing')
We can see different things here in test data:
object.Let's see some descriptive statistics for numeric features and non-numeric features, to make the data more intuitive to use.
But first, let's add the training data and test data together.
test.head()
train_ids = train.Id
train_target = train.SalePrice
test_ids = test.Id
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)
traintest = pd.concat([train, test], axis=0)
traintest.shape
traintest.describe(exclude='number').T
First let's see the shape of the NaN values in the dataset.
plt.figure(figsize=(20, 14))
sns.heatmap(traintest.isnull(), cbar=False);
We can see some patterns here for NaN values, and we can leverage that for feature engineering, for example:
True for the row of that column has missing value, and False if that row has a value.
Alley: Type of alley access to property.BsmtCond: Evaluates the general condition of the basement.# Checking NaN values for each Row.
## Remeber: Number of rows = 2,919
## Remeber: Number of cols = 81
traintest.isna().sum(axis=1).value_counts().head(15)
We can see some pattern here, there are 1,134 rows that each row has 5 NaNs, and so on.
# Checking NaN values for each columns.
## Remeber: Number of rows = 2,919
## Remeber: Number of cols = 81
traintest.isna().sum(axis=0).value_counts().head(15)
Half of the columns have no missing values. And there some of them have a huge amount of missing values, and we saw that in the heatmap.
Let's see the columns that have nulls:
nan_cols = [col for col in train.columns if train[col].isnull().sum() > 0]
nan_cols
traintest.apply(pd.Series.nunique, axis=0).sort_values(0).head()
Since the minimum number of unique values in the columns is 2, there's no column has constant value.
len_dup = traintest.loc[:, traintest.columns.duplicated()].shape[1]
print(f'~> The number of duplicate columns = {bg(len_dup, color="red")}')
to_visual = train[list(train.columns[1:20])+['SalePrice']].dropna()
grid = sns.PairGrid(data=to_visual, size=3, diag_sharey=False)
grid.map_lower(plt.scatter, alpha=.7)
grid.map_diag(plt.hist)
grid.map_upper(sns.kdeplot, cmap=plt.cm.OrRd_r)
plt.suptitle('Distrubtion', size=32, y=1.05);
to_visual = train[list(train.columns[20:40])+['SalePrice']].dropna()
sns.pairplot(to_visual);
to_visual = train[list(train.columns[40:50])+['SalePrice']].dropna()
sns.pairplot(to_visual);
to_visual = train[list(train.columns[50:65])+['SalePrice']].dropna()
sns.pairplot(to_visual);
def figsize(sz1=8, sz2=6):
return plt.figure(figsize=(sz1, sz2))
figsize(16, 8)
to_visual = train[list(train.columns[1:])]
sns.heatmap(to_visual.corr());
Observations:
plt.rcParams['font.size'] = 14
plt.figure(figsize=(14, 6))
sns.distplot(train.SalePrice);
train.SalePrice.describe()
plt.figure(figsize=(14, 6))
color = sns.color_palette()[0]
order = train['MSSubClass'].value_counts().index
ax = sns.countplot(x='MSSubClass',
data=train,
color=color,
order=order,
edgecolor='k')
show_annotation(ax, total=len(train));
plt.figure(figsize=(8, 6))
sns.regplot(data=train, x='MSSubClass', y='SalePrice', x_jitter=.7, y_jitter=.2,
scatter_kws={'alpha': 1/5});
def plot_cat_with_target(var, rotate=False):
figsize(16, 6)
plt.rcParams['font.size'] = 12
target = 'SalePrice'
plt.subplot(131)
ax = train[var].value_counts().plot.bar(edgecolor='k')
show_annotation(ax, 20, len(train))
plt.title(f'Distrubtion of {var}', size=16, y=1.05)
plt.subplot(132)
sns.boxplot(data=train, x=var, y=target)
plt.title(f'Relation b/w {var} & SalePrice', size=16, y=1.05);
plt.xticks(rotation=(90 if rotate else 0))
plt.subplot(133)
sns.swarmplot(data=train, x=var, y=target, color='black', size=3)
plt.title(f'Relation b/w {var} & SalePrice', size=16, y=1.05)
plt.xticks(rotation=(90 if rotate else 0));
plt.subplots_adjust(wspace=.4)
plot_cat_with_target('MSZoning')
figsize(16, 6)
plt.subplot(121)
sns.distplot(train['LotFrontage'].dropna(), kde=False);
plt.title('Historgram of LotFrontage', size=20, y=1.05)
plt.subplot(122)
sns.regplot(data=train, x='LotFrontage', y='SalePrice')
plt.title('Relation b/w LotFrontage & SalePrice', size=20, y=1.05);
# plt.subplots_adjust(wspace=.2);
plot_cat_with_target('Street')
train.Alley.fillna('NaN', inplace=True)
test.Alley.fillna('NaN', inplace=True)
plot_cat_with_target('Alley')
There is a lot of houses that don't have alley to the house.
What I can infer is that, this feature does't tell us much about the price of the house.
plot_cat_with_target('LotShape')
Flatness of the property
plot_cat_with_target('LandContour')
Type of utilities available
plot_cat_with_target('Utilities')
Obervation:
We can see here, that Utilities features, have only one point that's different than AllPub, IMHO, we should remove this feature.
Lot Configuration
plot_cat_with_target('LotConfig')
Slope of property
plot_cat_with_target('LandSlope')
Physical locations within Ames city limits
figsize(30, 6)
plt.rcParams['font.size'] = 12
var = 'Neighborhood'
target = 'SalePrice'
plt.subplot(131)
ax = train[var].value_counts().plot.bar(edgecolor='k')
# show_annotation(ax, 20, len(train))
plt.title(f'Distrubtion of {var}', size=16, y=1.05)
plt.subplot(132)
sns.boxplot(data=train, x=var, y=target)
plt.xticks(rotation=45)
# sns.swarmplot(data=train, x=var, y=target, color='black', size=3)
plt.title(f'Relation b/w {var} & SalePrice', size=16, y=1.05);
plt.subplots_adjust(wspace=.2)
Observation:
I can see the box plot follows a kind of sin wave patter, this might help in the feature engineering.
Proximity to various conditions
plot_cat_with_target('Condition1', rotate=True)
Proximity to various conditions (if more than one is present)
plot_cat_with_target('Condition2', rotate=True)
Type of dwelling
plot_cat_with_target('BldgType', rotate=True)
Observation:
You can see, there 2 points, that we can call them outliers, since they have a price larger enough than the normal.
Interesting Observation:
You can see from these plots and all the above plots, the same 2 points appears in the swarmp plot, this is quite interesting, we wouldn't get this observation, from watching a few plots.
Hence, we should remove them.
Style of dwelling
plot_cat_with_target('HouseStyle', rotate=True)
Rates the overall material and finish of the house
plot_cat_with_target('OverallQual')
Observation:
In the box plot, we see an exponential raising.
When the material is very excellent, the price increases, that's very resonable.
Rates the overall condition of the house
plot_cat_with_target('OverallCond')
Observation:
We see the condition of the house, doesn't increase well when the condition is good, unlike the condition of the material of the house.
Since these 2 features are timedate, we can use them very well in the feature engineering.
figsize(16, 6);
sns.jointplot(data=train, x='YearBuilt', y='SalePrice', joint_kws={'alpha': .5});
We can see a linear relationship b/w YearBuilt and SalePrice.
And the pearson correlation is 0.5.
figsize(20, 6)
sns.jointplot(data=train, x='YearRemodAdd', y='SalePrice', joint_kws={'alpha': .5});
plot_cat_with_target('RoofStyle', rotate=True)
Observation:
Looks like the RoofStyle is not a good parameter to estimate the price of the house, since all the values of the RoofStyle are intertwined.
plot_cat_with_target('RoofMatl', rotate=True)
Observation:
Looks like the RoofMatl tells us something about the sale price, we should consider that.
plot_cat_with_target('Exterior1st', rotate=True)
plot_cat_with_target('Exterior2nd', rotate=True)
train.MasVnrType.fillna('NaN', inplace=True)
test.MasVnrType.fillna('NaN', inplace=True)
plot_cat_with_target('MasVnrType')
figsize(16, 6)
sns.jointplot(data=train, x='MasVnrArea', y='SalePrice', joint_kws={'alpha': .5})
plot_cat_with_target('ExterQual');
Observation:
This feature is very important, since it can distinguish between distinct values for the sale price.
plot_cat_with_target('ExterCond', rotate=True)
plot_cat_with_target('Foundation')
train.BsmtQual.fillna('NaN', inplace=True)
test.BsmtQual.fillna('NaN', inplace=True)
plot_cat_with_target('BsmtQual')
Observation:
As we can see, as the quality of the basement increases, which cause the price of the house to increase.
train.BsmtCond.fillna('NaN', inplace=True)
test.BsmtCond.fillna('NaN', inplace=True)
plot_cat_with_target('BsmtCond')
Observation:
As we can see, as the condition of the basement increases, which cause the price of the house to increase, and vice versa.
train.BsmtExposure.fillna('NaN', inplace=True)
test.BsmtExposure.fillna('NaN', inplace=True)
plot_cat_with_target('BsmtExposure')
train.BsmtFinType1.fillna('NaN', inplace=True)
test.BsmtFinType1.fillna('NaN', inplace=True)
plot_cat_with_target('BsmtFinType1')
figsize()
sns.jointplot(data=train, x='BsmtFinSF1', y='SalePrice', kind='reg');
train.BsmtFinType2.fillna('NaN', inplace=True)
test.BsmtFinType2.fillna('NaN', inplace=True)
plot_cat_with_target('BsmtFinType2')
figsize()
sns.jointplot(data=train, x='BsmtFinSF2', y='SalePrice', kind='reg');
figsize()
sns.jointplot(data=train, x='BsmtUnfSF', y='SalePrice', kind='reg');
figsize()
sns.jointplot(data=train, x='TotalBsmtSF', y='SalePrice', kind='reg');
plot_cat_with_target('Heating')
plot_cat_with_target('HeatingQC')
plot_cat_with_target('CentralAir')
Observation:
As we can see this is important feature.
plot_cat_with_target('Electrical')
figsize()
sns.jointplot(data=train, x='1stFlrSF', y='SalePrice', kind='reg');
sns.jointplot(data=train, x='2ndFlrSF', y='SalePrice', kind='reg');
sns.jointplot(data=train, x='LowQualFinSF', y='SalePrice', kind='reg');
plot_cat_with_target('BsmtFullBath')
Observation:
As the number of bathrooms increases, the price of the house increases as well.
plot_cat_with_target('BsmtHalfBath')
sns.jointplot(data=train, x='GrLivArea', y='SalePrice', kind='reg');
plot_cat_with_target('FullBath')
plot_cat_with_target('HalfBath')
plot_cat_with_target('KitchenQual')
plot_cat_with_target('TotRmsAbvGrd')
plot_cat_with_target('Functional')
plot_cat_with_target('Fireplaces')
train.FireplaceQu.fillna('NaN', inplace=True)
test.FireplaceQu.fillna('NaN', inplace=True)
plot_cat_with_target('FireplaceQu')
for dataset in [train, test]:
dataset.fillna('NaN', inplace=True)
plot_cat_with_target('GarageType')
train.GarageYrBlt = train.GarageYrBlt.astype(np.float32)
sns.jointplot(data=train, x='GarageYrBlt', y='SalePrice', kind='reg');
train.GarageFinish.fillna('NaN', inplace=True)
test.GarageFinish.fillna('NaN', inplace=True)
plot_cat_with_target('GarageFinish')
plot_cat_with_target('GarageCars')
sns.jointplot(data=train, x='GarageArea', y='SalePrice', kind='reg')
for dataset in [train, test]:
dataset.fillna('NaN', inplace=True)
plot_cat_with_target('GarageQual')
for dataset in [train, test]:
dataset.fillna('NaN', inplace=True)
plot_cat_with_target('GarageCond')
plot_cat_with_target('PavedDrive')
sns.jointplot('WoodDeckSF', 'SalePrice', train)
sns.jointplot('OpenPorchSF', 'SalePrice', train)
sns.jointplot('EnclosedPorch', 'SalePrice', train);
sns.jointplot('3SsnPorch', 'SalePrice', train);
sns.jointplot('ScreenPorch', 'SalePrice', train);
sns.jointplot('PoolArea', 'SalePrice', train);
train.PoolQC.fillna('NaN', inplace=True)
test.PoolQC.fillna('NaN', inplace=True)
plot_cat_with_target('PoolQC')
train.Fence.fillna('NaN', inplace=True)
test.Fence.fillna('NaN', inplace=True)
plot_cat_with_target('Fence')
train.MiscFeature.fillna('NaN', inplace=True)
test.MiscFeature.fillna('NaN', inplace=True)
plot_cat_with_target('MiscFeature')
sns.jointplot('MiscVal', 'SalePrice', train);
plot_cat_with_target('MoSold')
plot_cat_with_target('YrSold')
plot_cat_with_target('SaleType')
plot_cat_with_target('SaleCondition', rotate=True)
plot_cat_with_target('BedroomAbvGr', rotate=True)
plot_cat_with_target('KitchenAbvGr', rotate=True)
Over All Observation:
- There are some features that have dominant values, and the rest of the values are minorities.
- The quality of things matters in terms of price.
ord_cols = ['LandSlope', 'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtExposure',
'BsmtFinType1', 'BsmtCond', 'BsmtQual', 'BsmtFinType2', 'HeatingQC', 'Electrical',
'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
'Fireplaces', 'FireplaceQu', 'GarageFinish', 'GarageCars', 'GarageQual', 'GarageCond', 'PavedDrive',
'PoolQC', 'Fence', 'BedroomAbvGr', 'KitchenAbvGr']
cat_cols = ['MSSubClass', 'MSZoning', 'Alley', 'LotShape', 'LandContour', 'LotConfig',
'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'RoofStyle', 'RoofMatl', 'Exterior1st',
'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'GarageType', 'MiscFeature', 'SaleType',
'SaleCondition', 'HouseStyle']
num_cols = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
'1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
date_cols = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold']
to_drop = ['Utilities', 'Street', 'PoolArea', 'MiscVal']
bin_cols = ['CentralAir']
## Let's check if the number of columns in above lists are equal to the number of colums in train dataset.
print(f'~> Number of train columns: {bg(len(train.columns)-2)}')
print(f'~> Number of observed columns: {bg(len(ord_cols+cat_cols+num_cols+date_cols+to_drop+bin_cols))}')
print(f'> They are equal: {len(ord_cols+cat_cols+num_cols+date_cols+to_drop+bin_cols) == len(train.columns)-2}')
# Check for columns that are not included in our lists.
for col in train:
if (col not in ord_cols) and (col not in cat_cols) and (col not in num_cols) and (col not in date_cols) and (col not in to_drop) and (col not in bin_cols):
if col not in ['Id', 'SalePrice']:
print(col)
print('All Columns are included!')
[col for col in train.columns if train[col].isnull().sum() > 0]
plt.figure(figsize=(20, 14))
sns.heatmap(train.isnull(), cbar=False);
train.LotFrontage = train.LotFrontage.astype(np.float)
train.LotFrontage.describe()
## Number of rows has null value in `LotFronage`
train.LotFrontage.isnull().shape[0], test.LotFrontage.isnull().shape[0]
# Fill the missing value with the median.
train.fillna(train.median(), inplace=True)
test.fillna(test.median(), inplace=True)
figsize(16, 6)
plt.subplot(121)
sns.distplot(train['LotFrontage'], kde=False);
plt.title('Historgram of LotFrontage', size=20, y=1.05)
plt.subplot(122)
sns.regplot(data=train, x='LotFrontage', y='SalePrice')
plt.title('Relation b/w LotFrontage & SalePrice', size=20, y=1.05);
## Sanity Check
train.LotFrontage.isna().sum(), test.LotFrontage.isna().sum()
## Number of rows has null value in that column
train.Electrical.isna().sum(), test.Electrical.isna().sum()
train.Electrical.fillna('SBrkr', inplace=True)
## Sanity Check
train.Electrical.isna().sum(), test.Electrical.isna().sum()
plt.figure(figsize=(20, 14))
sns.heatmap(train.isnull(), cbar=False);
train.isnull().sum().sum(), test.isnull().sum().sum()
train['Id'] = train_ids
train['SalePrice'] = train_target
train.to_hdf('train_null_removed.h5', 'data', mode='w')
test.to_hdf('test_null_removed.h5', 'data', mode='w')